{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install nltk\n",
    "import nltk\n",
    "# nltk.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6\n",
      "**************************************************\n",
      "['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.']\n",
      "[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('leading', 'VBG'), ('platform', 'NN'), ('for', 'IN'), ('building', 'VBG'), ('Python', 'NNP'), ('programs', 'NNS'), ('to', 'TO'), ('work', 'VB'), ('with', 'IN'), ('human', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('.', '.')]\n",
      "(S\n",
      "  (ORGANIZATION NLTK/NNP)\n",
      "  is/VBZ\n",
      "  a/DT\n",
      "  leading/VBG\n",
      "  platform/NN\n",
      "  for/IN\n",
      "  building/VBG\n",
      "  (PERSON Python/NNP)\n",
      "  programs/NNS\n",
      "  to/TO\n",
      "  work/VB\n",
      "  with/IN\n",
      "  human/JJ\n",
      "  language/NN\n",
      "  data/NNS\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "#分句\n",
    "text = \"\"\"\n",
    "NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.\n",
    "\n",
    "Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation, NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike. NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project.\n",
    "\n",
    "NLTK has been called “a wonderful tool for teaching, and working in, computational linguistics using Python,” and “an amazing library to play with natural language.”\n",
    "\"\"\"\n",
    "sentences = nltk.sent_tokenize(text)\n",
    "print(len(sentences))\n",
    "print('*'*50)\n",
    "sentence = sentences[0]\n",
    "\n",
    "# 分词\n",
    "tokens = nltk.word_tokenize(sentence)\n",
    "print(tokens)\n",
    "\n",
    "# 词性标注\n",
    "tagged = nltk.pos_tag(tokens)\n",
    "print(tagged)\n",
    "\n",
    "# 实体识别\n",
    "entities = nltk.chunk.ne_chunk(tagged)\n",
    "print(entities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import treebank\n",
    "# t = treebank.parsed_sents('wsj_0001.mrg')[0]\n",
    "# t.draw()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S (NP Alice) (VP (V loves) (NP Bob)))\n"
     ]
    }
   ],
   "source": [
    "# 解析语法结构\n",
    "text15 = nltk.word_tokenize(\"Alice loves Bob\")\n",
    "grammar = nltk.CFG.fromstring(\"\"\"\n",
    "S -> NP VP\n",
    "VP -> V NP\n",
    "NP -> 'Alice' | 'Bob'\n",
    "V -> 'loves'\n",
    "\"\"\")\n",
    "parser = nltk.ChartParser(grammar)\n",
    "trees = parser.parse_all(text15)\n",
    "for tree in trees:\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S (NP Alice) (VP (V loves) (NP Bob)))\n"
     ]
    }
   ],
   "source": [
    "#生成语法树\n",
    "parser = nltk.ChartParser(grammar)\n",
    "trees = parser.parse_all(text15)\n",
    "for tree in trees:\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 词形还原"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "car\n",
      "men\n",
      "run\n",
      "eat\n",
      "sad\n",
      "fancy\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem import WordNetLemmatizer\n",
    "wnl = WordNetLemmatizer()\n",
    "# lemmatize nouns\n",
    "print(wnl.lemmatize('cars', 'n'))\n",
    "print(wnl.lemmatize('men', 'n'))\n",
    "\n",
    "# lemmatize verbs\n",
    "print(wnl.lemmatize('running', 'v'))\n",
    "print(wnl.lemmatize('ate', 'v'))\n",
    "\n",
    "# lemmatize adjectives\n",
    "print(wnl.lemmatize('saddest', 'a'))\n",
    "print(wnl.lemmatize('fancier', 'a'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 词干提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "maximum\n",
      "maxim\n",
      "maximum\n"
     ]
    }
   ],
   "source": [
    "# 基于Porter词干提取算法\n",
    "from nltk.stem.porter import PorterStemmer  \n",
    "porter_stemmer = PorterStemmer()  \n",
    "print(porter_stemmer.stem('maximum'))\n",
    "\n",
    "# 基于Lancaster 词干提取算法\n",
    "from nltk.stem.lancaster import LancasterStemmer  \n",
    "lancaster_stemmer = LancasterStemmer()  \n",
    "print(lancaster_stemmer.stem('maximum'))\n",
    "\n",
    "# 基于Snowball 词干提取算法\n",
    "from nltk.stem import SnowballStemmer  \n",
    "snowball_stemmer = SnowballStemmer(\"english\")  \n",
    "print(snowball_stemmer.stem('maximum'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
